library(tidyverse)
library(RMySQL)
library(lubridate)
Front matter June 3, 2020 at 4:59PM.
Name your files applied_ps_3.Rmd and applied_ps_3.html. (5 pts.)
Follow the style guide (10 pts.)
This submission is our work alone and complies with the 30535 integrity policy.
Add your initials to indicate your agreement: JCI
Add names of anyone you discussed this problem set with: **__**
Submit by pushing your code to your repo on Github Classroom: https://classroom.github.com/g/5vDiXToZ.
Late coins used this pset: X. Late coins left: X.
waze data
You can find the waze data dictionary here.
At the start of the course you agreed to follow these data usage terms. Here are the most important parts:
Prelim questions
Working with data on a server adds a challenge as you have to make calls to the database which take time to process. A call to the database can be slow for several reasons.
We can adjust for 1 and 2 by testing our code on small subsections of the data.
Next week we will provide an opt out where you can use csv we provide. Using this option will result in a 10 percent discount on your problem set final grade. For example, if you earn \(90\) pts based on your solutions, your final grade will be \(90 \cdot .9 = 81\).
Use filter() to reduce the amount of data you pull while exploring data. For example, you can filter by time and location to only get data for a small part of the city and/or over a short time period. Use collect() to bring a small sample data set into memory on your computer. Do not collect() the entire data set each time you want to work with it.
I would like to see Waze predict when draw bridges will raise and lower to allow ships to pass under. This is a major problem for people who have to commute over those bridges because it’s so unpredictable. It can easily add 20+ minutes to a commute. As is, Waze doesn’t know that bridges will raise until traffic has stopped and Wazers report it. But if Waze could cooperate with the authorities that control bridges (including cities, states, and the army corps of engineers) they could warn drivers in advance of these delays, allowing users to seek alternate routes. The challenge for adding this new variable would be coordinating with these authorities to generate and share the data; there are likely to be many agencies involved in maintaining these bridges, and some of them may not have the infrastructure that would be needed to report this data. But another approach might be to use live marine traffic data like https://www.marinetraffic.com/ to “guess” when a bridge might need to be raised to allow a ship under.
Read up on the ggmap package, which will be useful for doing these problems. Particularly, get to know the get_stamenmap() function. If you find yourself downloading 1000s of tiles, check your settings. You are welcome to try using google basemaps as well; while free for new users, this will require a credit card. The version of ggmap on CRAN is out of date, instead find and install it from github.
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(RMySQL)
library(stringr)
# Open a connection to the course's replica Waze MySQL database.
# NOTE(review): credentials are hardcoded in plain text and committed to
# the repo; prefer environment variables (Sys.getenv) or a .Renviron file.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
DBI::dbListTables(connection)
## [1] "chiAlerts"
# Lazy dbplyr reference to the alerts table; no rows are pulled until collect().
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Y bounds: 41.896 to 41.934 (W George St to W Chicago Ave).
# NOTE(review): the original comment said 41.935 but the filter used
# 41.934; the comment is corrected to match the code -- confirm intent.
# Pull only accidents on N Western Ave inside the latitude band; the
# filter runs server-side (dbplyr translates between() to SQL BETWEEN,
# which is inclusive like the original <= pair).
event_data <- chi_alerts_sql %>%
  filter(
    str_detect(street, "N Western Ave"),
    between(location_y, 41.896, 41.934),
    type == "ACCIDENT"
  ) %>%
  collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Cache the collected data locally so later work can run without the DB.
saveRDS(event_data, file = "event_data.rds")
event_data <- readRDS(file = "event_data.rds")
# Close every open MySQL connection (the server limits concurrent sessions).
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,0>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Sanity check: all 315 rows are on N Western Ave.
event_data %>%
group_by(street) %>%
summarize(n())
## # A tibble: 1 x 2
## street `n()`
## <chr> <int>
## 1 N Western Ave 315
# Bounding box (lon/lat, decimal degrees) for the N Western Ave corridor map.
corridor_7 <- c(
left = -87.7, bottom = 41.899, right =
-87.67, top = 41.934
)
# Download a Stamen toner-lite basemap for the corridor bounding box.
# NOTE(review): get_stamenmap() has no `data` argument -- the original's
# `data = event_data` fell into `...` and was silently ignored; removed.
corridor_7_stamenmap <- get_stamenmap(
  bbox = corridor_7,
  zoom = 15,
  maptype = "toner-lite"
)
## Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Attach event_data as the base layer so geoms can map its columns directly.
corridor_7_map <- ggmap(corridor_7_stamenmap,
base_layer = ggplot(data = event_data)
)
corridor_7_map
# Overlay accident locations on the basemap.
corridor_7_map +
geom_point(aes(location_x, location_y), color = "red")
## Warning: Removed 12 rows containing missing values (geom_point).
# Re-open the connection (all connections were closed above).
# NOTE(review): this connect/tbl boilerplate repeats six times in this
# file; a small helper function would remove the duplication.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Pull all alerts on the 79th St and Chicago Ave corridors (both E/W
# halves plus the unprefixed "Chicago Ave" variant), Chicago only.
event_data <- chi_alerts_sql %>%
filter(
street %in% c(
"E 79th St",
"W 79th St",
"Chicago Ave",
"E Chicago Ave",
"W Chicago Ave"
),
city == "Chicago, IL"
) %>%
collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Cache locally, then disconnect to free the server session.
saveRDS(event_data, file = "event_data.rds")
event_data <- readRDS(file = "event_data.rds")
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,1>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Row counts per street variant.
event_data %>%
group_by(street) %>%
summarize(n())
## # A tibble: 5 x 2
## street `n()`
## <chr> <int>
## 1 Chicago Ave 6
## 2 E 79th St 37827
## 3 E Chicago Ave 2078
## 4 W 79th St 7227
## 5 W Chicago Ave 29926
event_data <- event_data %>%
  mutate(
    # Collapse the E/W street halves into one corridor label.
    # (Unquoted LHS name -- quoting "corridor" worked but is unidiomatic;
    # if_else() is type-stable, unlike base ifelse.)
    corridor = if_else(street %in% c("E 79th St", "W 79th St"),
      "79th St",
      "Chicago Ave"
    ),
    # pubMillis is epoch milliseconds; convert to a date-time.
    event_date_time = as.POSIXct(pubMillis / 1000, origin = "1970-01-01")
  )
# Hourly distribution of jams and accidents, compared across corridors.
event_data %>%
filter(type %in% c("JAM", "ACCIDENT")) %>%
ggplot(aes(hour(event_date_time), fill = corridor)) +
geom_bar(position = "dodge")
# Re-open the connection for the comparison-corridor pull.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Comparison corridors: 87th St and Division St (all street-name variants).
event_data_comparison <- chi_alerts_sql %>%
filter(
street %in% c(
"E 87th St",
"W 87th St",
"E Division St",
"W Division St",
"Division St"
),
city == "Chicago, IL"
) %>%
collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Cache the comparison data under its own file name.
# BUG FIX(review): the original wrote to "event_data.rds", silently
# clobbering the 79th St / Chicago Ave cache saved earlier in the script.
saveRDS(event_data_comparison, file = "event_data_comparison.rds")
event_data_comparison <- readRDS(file = "event_data_comparison.rds")
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,2>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Row counts per street in the comparison corridors.
event_data_comparison %>%
  group_by(street) %>%
  summarize(n())
## # A tibble: 4 x 2
## street `n()`
## <chr> <int>
## 1 E 87th St 1043
## 2 E Division St 48
## 3 W 87th St 6505
## 4 W Division St 21524
event_data_comparison <- event_data_comparison %>%
  mutate(
    # Unquoted LHS name; if_else() is type-stable vs base ifelse.
    corridor = if_else(street %in% c("E 87th St", "W 87th St"),
      "87th St",
      "Division St"
    ),
    # Epoch milliseconds -> date-time.
    event_date_time = as.POSIXct(pubMillis / 1000, origin = "1970-01-01")
  )
# Hourly alert counts for the comparison corridors, dodged by corridor.
event_data_comparison %>%
  ggplot(aes(hour(event_date_time), fill = corridor)) +
  geom_bar(position = "dodge")
a. Looking beyond traffic, what other alerts are very common in this area? Do you think these alerts would slow down the 66 / 79? If so, what steps could the City take to address the issues?
# Drop the ROAD_CLOSED spike so the remaining alert types are visible,
# then show hourly counts by alert type.
event_data %>%
  filter(type != "ROAD_CLOSED") %>%
  ggplot(aes(hour(event_date_time), fill = type)) +
  geom_bar(position = "dodge")
Besides traffic jams, weather hazards and accidents are also common. These would definitely slow down traffic for the buses. There also seems to be a case where 79th street was closed, triggering many event reports.
# Re-open the connection to look up a single alert by uuid.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Pull every scrape of one specific alert; the same uuid appears once per
# scrape, so a long-lived alert yields multiple rows (4 here).
single_event_data <- chi_alerts_sql %>%
filter(uuid == "c5a73cc6-5242-3172-be5a-cf8990d70cb2") %>%
collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
saveRDS(single_event_data, file = "single_event_data.rds")
single_event_data <- readRDS(file = "single_event_data.rds")
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,3>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
single_event_data
## # A tibble: 4 x 17
## country nTHumbsUp city reportRating confidence reliability type uuid
## <chr> <int> <chr> <int> <int> <int> <chr> <chr>
## 1 US 0 Chic… 4 0 5 JAM c5a7…
## 2 US 0 Chic… 4 0 5 JAM c5a7…
## 3 US 0 Chic… 4 0 5 JAM c5a7…
## 4 US 0 Chic… 4 0 5 JAM c5a7…
## # … with 9 more variables: roadType <int>, magvar <int>, subtype <chr>,
## # street <chr>, location_x <dbl>, location_y <dbl>, pubMillis <dbl>,
## # reportDescription <chr>, scrape_dt <chr>
# Convert a date-time string (America/Chicago) to epoch milliseconds.
#
# @param time A "YYYY-MM-DD HH:MM:SS" string, interpreted in Chicago time.
# @return Epoch milliseconds as a plain numeric.
#   FIX(review): the original returned a lubridate Duration object;
#   wrapping in as.numeric() makes the return type match the redefinition
#   of this function later in the file and compares cleanly to pubMillis.
convert_to_millis <- function(time) {
  time <- ymd_hms(time, tz = "America/Chicago")
  duration <- as.duration(time - ymd_hms("1970-01-01 00:00:00"))
  as.numeric(duration * 1000)
}
# Re-open the connection for the Soldier Field area pull.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Epoch-millisecond bounds for 2017-12-24, Chicago time.
time_range <- c(
start = convert_to_millis("2017-12-24 00:00:00"),
end = convert_to_millis("2017-12-25 00:00:00")
)
time_range
## start end
## 1.514095e+12 1.514182e+12
# Event date: 2017-12-24 12:02:55 CST
# The literal millis below match time_range above (computed locally so
# the filter can be translated to SQL without re-evaluating lubridate).
single_event_data <- chi_alerts_sql %>%
filter(
1514095200000 <= pubMillis, pubMillis <= 1514181600000,
-87.624138 <= location_x & location_x <= -87.6,
41.855 <= location_y & location_y <= 41.89
) %>%
collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# (Dead alternates of the location filter above, kept for reference.)
#-87.624138 <= location_x, location_x <= -87.6,
# 41.855 <= location_y, location_y <= 41.89,
# Save collected data as an rds file, access and read it
saveRDS(single_event_data, file = "single_event_data.rds")
single_event_data <- readRDS(file = "single_event_data.rds")
# Close all open connections before plotting.
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,4>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Attach the Chicago timezone label without shifting the clock time.
# BUG FIX(review): the original closed force_tz()'s parentheses too
# early, so "America/Chicago" became a stray extra argument to mutate()
# and the timezone was never applied to event_date_time.
single_event_data <- single_event_data %>%
  mutate(event_date_time = force_tz(
    as.POSIXct(pubMillis / 1000, origin = "1970-01-01"),
    tzone = "America/Chicago"
  ))
# OlsonNames() "America/Chicago"
# OlsonNames() "America/Chicago"
# Bounding box around the Soldier Field / Lake Shore Drive area.
single_event_bounds <- c(
left = -87.624138, bottom = 41.855, right =
-87.6, top = 41.89
)
# Basemap for the single-event area.
# NOTE(review): get_stamenmap() has no `data` argument; the original's
# `data = single_event_data` was silently ignored and is removed.
single_event_stamenmap <- get_stamenmap(
  bbox = single_event_bounds,
  zoom = 17,
  maptype = "toner-lite"
)
## Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
## 162 tiles needed, this may take a while (try a smaller zoom).
# Plot alert locations colored by hour of day to show the jam's timing.
single_event_map <- ggmap(single_event_stamenmap,
base_layer = ggplot(data = single_event_data)
)
single_event_map +
geom_point(aes(location_x, location_y, color = hour(event_date_time)))
What causes all these jams? Some googling might help. The jams are caused by game-day traffic from Soldier Field (which is the oval to the right of Lake Shore Drive in the map).
Plot the number of jams 6AM-6PM CST. Why are there two humps?
# Jam alerts between 6AM and 6PM (inclusive), binned across the day.
single_event_data %>%
  filter(type == "JAM", between(hour(event_date_time), 6, 18)) %>%
  ggplot(aes(event_date_time)) +
  geom_freqpoly(bins = 52)
There are 2 humps at (about) 11:45 AM and 3:30 PM because there was a football game at Soldier Field that day, which caused traffic on Lake Shore Drive. The game on Dec. 24 started at 1:00 PM and lasted until about 4:00 PM. These start and end times match up with the two peaks.
Interestingly, the graph suggests that fans may have started leaving before the end of the game, probably because it wasn’t a competitive game and it was a cold day with temperatures in the mid-20s. In the game’s 3rd quarter, the Bears ran up a 20-3 lead on the Browns, who had also lost every game that season (and ended up with a 0-15 season). Many fans prefer to leave early during a non-competitive game to “beat traffic.” So it’s not surprising that the peak would start prior to the actual end of the game.
https://www.pro-football-reference.com/boxscores/201712240chi.htm https://www.timeanddate.com/weather/usa/chicago/historic?month=12&year=2017 a. Place one vertical line at each hump.
# Same 6AM-6PM jam distribution, with a vertical line at each hump.
single_event_data %>%
  filter(
    type == "JAM",
    hour(event_date_time) %in% 6:18
  ) %>%
  ggplot(aes(event_date_time)) +
  geom_freqpoly(bins = 52) +
  # ISO numeric dates parse unambiguously in any locale; the original's
  # "2017-Dec-24" relies on an English locale for the month abbreviation.
  geom_vline(xintercept = ymd_hms("2017-12-24 11:50:00",
    tz = "America/Chicago"
  )) +
  geom_vline(xintercept = ymd_hms("2017-12-24 15:20:00",
    tz = "America/Chicago"
  ))
# JAM alerts carry severity information in the subtype variable; map each
# subtype onto an ordinal 1 (light) .. 4 (standstill) scale.
# (Unquoted LHS name -- quoting "jam_severity" worked but is unidiomatic.
# Rows with other subtypes, or NA, get jam_severity = NA.)
single_event_data <- single_event_data %>%
  mutate(jam_severity = case_when(
    subtype == "JAM_LIGHT_TRAFFIC" ~ 1,
    subtype == "JAM_MODERATE_TRAFFIC" ~ 2,
    subtype == "JAM_HEAVY_TRAFFIC" ~ 3,
    subtype == "JAM_STAND_STILL_TRAFFIC" ~ 4
  ))
# https://stackoverflow.com/questions/24459752/can-dplyr-package-be-used-for-conditional-mutating
# Severity-weighted jam totals by hour of day.
# BUG FIX(review): the original passed a stray third argument
# `jam_severity` to ggplot(); it was never used (lazily ignored) and is
# removed.
single_event_data %>%
  filter(
    type == "JAM",
    hour(event_date_time) %in% 6:18
  ) %>%
  ggplot(aes(hour(event_date_time), jam_severity)) +
  geom_col()
## Warning: Removed 41 rows containing missing values (position_stack).
The timing of the peaks doesn’t change. But based on the severity measure, the 11:00 AM traffic is actually worse than the 3:00 PM traffic. This was not apparent from the previous graph, which made it look like there was more traffic at 3:00 PM.
# Re-open the connection for the multi-event accident pull.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Event PubMillis 1511191459107
# +/- 2 hours (7,200,000 ms) around the accident's pubMillis.
1511191459107 + c(-7200000, 7200000)
## [1] 1.511184e+12 1.511199e+12
# https://gis.stackexchange.com/questions/142326/calculating-longitude-length-in-miles
# Each degree of latitude is approximately 69 miles (111 kilometers) apart
# -87.66080 42.00315
# +/- 0.1 mile in degrees (0.1 / 69), applied to both lon and lat.
# NOTE(review): 69 mi/degree holds for latitude; longitude degrees are
# shorter at Chicago's latitude, so the E-W window is slightly wider
# than 0.1 mile -- acceptable for this rough box.
-87.66080 + c(-.1 / 69, .1 / 69)
## [1] -87.66225 -87.65935
42.00315 + c(-.1 / 69, .1 / 69)
## [1] 42.0017 42.0046
# All alerts within the box and time window around the accident.
multiple_event_data <- chi_alerts_sql %>%
filter(
-87.66225 <= location_x & location_x <= -87.65935,
42.0017 <= location_y & location_y <= 42.0046,
1511184259107 <= pubMillis, pubMillis <= 1511198659107
) %>%
collect()
## Warning in .local(conn, statement, ...): Decimal MySQL column 12 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 13 imported
## as numeric
# Cache, disconnect, and convert pubMillis to Chicago-local date-times.
saveRDS(multiple_event_data, file = "multiple_event_data.rds")
multiple_event_data <- readRDS(file = "multiple_event_data.rds")
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,5>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Here tz= is passed correctly inside as.POSIXct (unlike the earlier
# force_tz chunk).
multiple_event_data <- multiple_event_data %>%
mutate(event_date_time = as.POSIXct(pubMillis / 1000,
origin = "1970-01-01",
tz = "America/Chicago"))
# Bounding box matching the SQL filter above.
multiple_event_bounds <- c(
left = -87.66225, bottom = 42.0017, right =
-87.65935, top = 42.0046
)
# Basemap for the accident neighborhood.
# NOTE(review): get_stamenmap() has no `data` argument; the original's
# `data = multiple_event_data` was silently ignored and is removed.
multiple_event_stamenmap <- get_stamenmap(
  bbox = multiple_event_bounds,
  zoom = 18,
  maptype = "toner-lite"
)
## Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
# Plot nearby alerts, shaped by subtype, over the basemap.
multiple_event_map <- ggmap(multiple_event_stamenmap,
base_layer = ggplot(data = multiple_event_data)
)
multiple_event_map +
geom_point(aes(location_x, location_y, shape = subtype))
Make a plot where the y-axis is the number of traffic jam alerts and the x-axis is the five-minute interval from two hours before the accident to two hours after the accident. Warning: This question is harder than it first appears. You might want to review R4DS chapter 12.5 (lecture note 5) on missing values and chapter 16.4 (lecture note 9).
# Convert a date-time to epoch milliseconds (numeric).
#
# Generalized (review) so both call styles used in this file work: a
# POSIXct is used as-is (this definition's original contract), while a
# character value is parsed as Chicago time (the earlier definition's
# contract). Backward compatible with all existing call sites.
#
# @param time A POSIXct, or a "YYYY-MM-DD HH:MM:SS" string.
# @return Epoch milliseconds as a plain numeric.
convert_to_millis <- function(time) {
  if (is.character(time)) {
    time <- ymd_hms(time, tz = "America/Chicago")
  }
  duration <- as.duration(time - ymd_hms("1970-01-01 00:00:00"))
  as.numeric(duration * 1000)
}
# Five-minute buckets from -24 (two hours before) to +24 (two hours after).
categories <- c(-24:24)
multiple_event_data %>%
mutate(
# Milliseconds relative to the accident time (negative = before).
event_epoch_millis = pubMillis - convert_to_millis(event_date_time),
# Integer-divide into 5-minute (300,000 ms) buckets; the factor's
# fixed levels keep empty buckets representable.
bucket = factor(event_epoch_millis %/% 300000, levels = categories)
) %>%
group_by(bucket) %>%
summarize(count = n()) %>%
# summarize() drops empty factor levels; complete() restores them as 0.
complete(bucket, fill = list(count = 0)) %>%
ggplot(aes(bucket, count)) +
geom_col() +
labs(
title = "Timing of Events around Major Accidents",
x = "Minutes Before/After Major Accident (5 min increment)",
y = "Mean Number of Events"
) +
coord_flip()
# Given an accident's uuid, date-time, latitude, and longitude, return a
# data frame with the number of alerts in each five-minute interval from
# two hours before to two hours after, within roughly 0.1 mile.
#
# @param uuid Alert uuid (kept for interface compatibility; unused).
# @param date_time POSIXct time of the accident.
# @param latitude,longitude Accident coordinates in decimal degrees.
# @return Tibble with columns bucket (factor, levels -24..24) and count.
get_alert_summary <- function(uuid, date_time, latitude, longitude) {
  connection <- DBI::dbConnect(RMySQL::MySQL(),
    user = "ppha30531",
    dbname = "Waze2",
    port = 3306,
    password = "bUYjwnKXf49M2pb",
    host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
  )
  # Close only this function's connection, even if an error occurs.
  # BUG FIX(review): the original disconnected *every* open MySQL
  # connection via lapply(dbListConnections(...)), killing any
  # connection the caller still held.
  on.exit(DBI::dbDisconnect(connection), add = TRUE)
  chi_alerts_sql <- tbl(connection, "chiAlerts")
  # +/- 2 hours in ms; +/- 0.1 mile in degrees (1 degree ~ 69 miles).
  time_min <- convert_to_millis(date_time) - 7200000
  time_max <- convert_to_millis(date_time) + 7200000
  x_min <- longitude - .1 / 69
  x_max <- longitude + .1 / 69
  y_min <- latitude - .1 / 69
  y_max <- latitude + .1 / 69
  nearby_alerts <- chi_alerts_sql %>%
    filter(
      time_min <= pubMillis, pubMillis <= time_max,
      x_min <= location_x, location_x <= x_max,
      y_min <= location_y, location_y <= y_max
    ) %>%
    collect()
  # NOTE(review): the original saveRDS/readRDS round-trip through
  # "multiple_event_data.rds" on every call clobbered the script-level
  # cache file and added nothing; removed.
  categories <- -24:24
  nearby_alerts %>%
    mutate(
      event_epoch_millis = pubMillis - convert_to_millis(date_time),
      bucket = factor(event_epoch_millis %/% 300000, levels = categories)
    ) %>%
    group_by(bucket) %>%
    summarize(count = n()) %>%
    complete(bucket, fill = list(count = 0))
}
# Smoke test on one known accident (2017-11-27 ~9PM, near the lakefront).
get_alert_summary(
"df0ae6f2-22ae-3977-8168-0290610801f9",
as.POSIXct(1511837969275 / 1000, origin = "1970-01-01"),
41.8421,
-87.61015
)
## # A tibble: 49 x 2
## bucket count
## <fct> <dbl>
## 1 -24 0
## 2 -23 0
## 3 -22 0
## 4 -21 0
## 5 -20 0
## 6 -19 0
## 7 -18 0
## 8 -17 0
## 9 -16 0
## 10 -15 0
## # … with 39 more rows
# Pull every major accident in the table.
connection <- DBI::dbConnect(RMySQL::MySQL(),
user = "ppha30531",
dbname = "Waze2",
port = 3306,
password = "bUYjwnKXf49M2pb",
host = "uchicagowazereplica2.cfykgneqoh8w.us-west-2.rds.amazonaws.com"
)
chi_alerts_sql <- tbl(connection, "chiAlerts")
multiple_event_data <- chi_alerts_sql %>%
filter(subtype == "ACCIDENT_MAJOR") %>%
collect()
# NOTE(review): this overwrites the earlier +/-2h cache in
# "multiple_event_data.rds"; a distinct file name would be safer.
saveRDS(multiple_event_data, file = "multiple_event_data.rds")
multiple_event_data <- readRDS(file = "multiple_event_data.rds")
dbListConnections(MySQL())
## [[1]]
## <MySQLConnection:0,7>
lapply(dbListConnections(MySQL()), function(x) dbDisconnect(x))
## [[1]]
## [1] TRUE
# Keep one row per accident (uuid) on 2017-11-20, with local timestamps.
multiple_event_data <- multiple_event_data %>%
mutate(event_date_time = as.POSIXct(pubMillis / 1000,
origin = "1970-01-01",
tz = "America/Chicago")) %>%
filter(date(event_date_time) == ymd("2017-11-20")) %>%
distinct(uuid, .keep_all = TRUE)
# http://www.datasciencemadesimple.com/remove-duplicate-rows-r-using-dplyr-distinct-function/
# NOTE(review): the objects below look like leftover debugging scaffolding.
# `subset` masks base::subset(); `test`, `subset_test`, and the "" seed
# for alert_summary are never meaningfully used -- consider deleting.
subset <- multiple_event_data %>% head(5)
test <- list(
"df0ae6f2-22ae-3977-8168-0290610801f9",
as.POSIXct(1511837969275 / 1000, origin = "1970-01-01"),
41.8421,
-87.61015
)
as.POSIXct(1511837969275 / 1000, origin = "1970-01-01")
## [1] "2017-11-27 20:59:29 CST"
subset_test <- ""
alert_summary <- ""
# One alert-summary tibble per major accident, then stack them.
# FIXES(review): preallocate a list instead of growing a "" character
# vector; seq_len(n) instead of the seq_along(1:n) anti-pattern; bind
# the whole list in one bind_rows() call instead of an O(n^2) grow-loop
# seeded with colnames(alert_summary) (which is NULL for a list).
alert_summary <- vector("list", nrow(multiple_event_data))
for (i in seq_len(nrow(multiple_event_data))) {
  alert_summary[[i]] <- get_alert_summary(
    uuid = multiple_event_data$uuid[i],
    date_time = multiple_event_data$event_date_time[i],
    latitude = multiple_event_data$location_y[i],
    longitude = multiple_event_data$location_x[i]
  )
}
combined_table <- bind_rows(alert_summary)
# Average alert count per five-minute bucket across all major accidents.
full_event_summary <- combined_table %>%
  group_by(bucket) %>%
  summarize(mean_events = mean(count))
full_event_summary %>%
  ggplot(aes(bucket, mean_events)) +
  geom_col() +
  labs(
    title = "Timing of Events around Major Accidents",
    x = "Minutes Before/After Major Accident (5 min increment)",
    y = "Mean Number of Events"
  ) +
  coord_flip()